{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "import sagemaker\n",
    "import pandas as pd\n",
    "\n",
    "sess = sagemaker.Session()\n",
    "bucket = sess.default_bucket()\n",
    "role = sagemaker.get_execution_role()\n",
    "region = boto3.Session().region_name\n",
    "\n",
    "sm = boto3.Session().client(service_name=\"sagemaker\", region_name=region)\n",
    "iam = boto3.Session().client(service_name=\"iam\", region_name=region)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store -r processed_train_data_s3_uri"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    processed_train_data_s3_uri\n",
    "except NameError:\n",
    "    print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
    "    print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
    "    print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-train\n"
     ]
    }
   ],
   "source": [
    "print(processed_train_data_s3_uri)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store -r processed_validation_data_s3_uri"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    processed_validation_data_s3_uri\n",
    "except NameError:\n",
    "    print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
    "    print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
    "    print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-validation\n"
     ]
    }
   ],
   "source": [
    "print(processed_validation_data_s3_uri)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store -r processed_test_data_s3_uri"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    processed_test_data_s3_uri\n",
    "except NameError:\n",
    "    print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")\n",
    "    print(\"[ERROR] Please run the notebooks in the PREPARE section before you continue.\")\n",
    "    print(\"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-test\n"
     ]
    }
   ],
   "source": [
    "print(processed_test_data_s3_uri)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store -r max_seq_length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "64\n"
     ]
    }
   ],
   "source": [
    "print(max_seq_length)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Specify the Dataset in S3\n",
    "We are using the train, validation, and test splits created in the previous section."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-train\n",
      "2020-12-18 20:07:58     352211 part-algo-1-amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tfrecord\n",
      "2020-12-18 20:07:58      11710 part-algo-1-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord\n",
      "2020-12-18 20:06:28      10676 part-algo-2-amazon_reviews_us_Digital_Software_v1_00.tfrecord\n"
     ]
    }
   ],
   "source": [
    "print(processed_train_data_s3_uri)\n",
    "\n",
    "!aws s3 ls $processed_train_data_s3_uri/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-validation\n",
      "2020-12-18 20:07:58      20135 part-algo-1-amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tfrecord\n",
      "2020-12-18 20:07:58        657 part-algo-1-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord\n",
      "2020-12-18 20:06:28        630 part-algo-2-amazon_reviews_us_Digital_Software_v1_00.tfrecord\n"
     ]
    }
   ],
   "source": [
    "print(processed_validation_data_s3_uri)\n",
    "\n",
    "!aws s3 ls $processed_validation_data_s3_uri/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-test\n",
      "2020-12-18 20:07:59      19737 part-algo-1-amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tfrecord\n",
      "2020-12-18 20:07:59        648 part-algo-1-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord\n",
      "2020-12-18 20:06:29        665 part-algo-2-amazon_reviews_us_Digital_Software_v1_00.tfrecord\n"
     ]
    }
   ],
   "source": [
    "print(processed_test_data_s3_uri)\n",
    "\n",
    "!aws s3 ls $processed_test_data_s3_uri/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Specify S3 `Distribution Strategy`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-train', 'S3DataDistributionType': 'ShardedByS3Key'}}}\n",
      "{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-validation', 'S3DataDistributionType': 'ShardedByS3Key'}}}\n",
      "{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-test', 'S3DataDistributionType': 'ShardedByS3Key'}}}\n"
     ]
    }
   ],
   "source": [
    "from sagemaker.inputs import TrainingInput\n",
    "\n",
    "s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
    "s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
    "s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, distribution=\"ShardedByS3Key\")\n",
    "\n",
    "print(s3_input_train_data.config)\n",
    "print(s3_input_validation_data.config)\n",
    "print(s3_input_test_data.config)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup Hyper-Parameters for Classification Layer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "64\n"
     ]
    }
   ],
   "source": [
    "print(max_seq_length)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "epochs = 1\n",
    "learning_rate = 0.00001\n",
    "epsilon = 0.00000001\n",
    "train_batch_size = 128\n",
    "validation_batch_size = 128\n",
    "test_batch_size = 128\n",
    "train_steps_per_epoch = 100\n",
    "validation_steps = 100\n",
    "test_steps = 100\n",
    "train_instance_count = 1\n",
    "train_instance_type = \"ml.c5.9xlarge\"\n",
    "train_volume_size = 1024\n",
    "use_xla = True\n",
    "use_amp = True\n",
    "freeze_bert_layer = False\n",
    "enable_sagemaker_debugger = True\n",
    "enable_checkpointing = False\n",
    "enable_tensorboard = False\n",
    "input_mode = \"Pipe\"\n",
    "run_validation = True\n",
    "run_test = True\n",
    "run_sample_predictions = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "metrics_definitions = [\n",
    "    {\"Name\": \"train:loss\", \"Regex\": \"loss: ([0-9\\\\.]+)\"},\n",
    "    {\"Name\": \"train:accuracy\", \"Regex\": \"accuracy: ([0-9\\\\.]+)\"},\n",
    "    {\"Name\": \"validation:loss\", \"Regex\": \"val_loss: ([0-9\\\\.]+)\"},\n",
    "    {\"Name\": \"validation:accuracy\", \"Regex\": \"val_accuracy: ([0-9\\\\.]+)\"},\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup Our BERT + TensorFlow Script to Run on SageMaker\n",
    "Prepare our TensorFlow model to run on the managed SageMaker service"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sagemaker.tensorflow import TensorFlow\n",
    "\n",
    "estimator = TensorFlow(\n",
    "    entry_point=\"tf_bert_reviews.py\",\n",
    "    source_dir=\"src\",\n",
    "    role=role,\n",
    "    instance_count=train_instance_count,\n",
    "    instance_type=train_instance_type,\n",
    "    volume_size=train_volume_size,\n",
    "    #                        use_spot_instances=True,\n",
    "    #                        max_wait=7200, # Seconds to wait for spot instances to become available\n",
    "    py_version=\"py3\",\n",
    "    framework_version=\"2.1.0\",\n",
    "    hyperparameters={\n",
    "        \"epochs\": epochs,\n",
    "        \"learning_rate\": learning_rate,\n",
    "        \"epsilon\": epsilon,\n",
    "        \"train_batch_size\": train_batch_size,\n",
    "        \"validation_batch_size\": validation_batch_size,\n",
    "        \"test_batch_size\": test_batch_size,\n",
    "        \"train_steps_per_epoch\": train_steps_per_epoch,\n",
    "        \"validation_steps\": validation_steps,\n",
    "        \"test_steps\": test_steps,\n",
    "        \"use_xla\": use_xla,\n",
    "        \"use_amp\": use_amp,\n",
    "        \"max_seq_length\": max_seq_length,\n",
    "        \"freeze_bert_layer\": freeze_bert_layer,\n",
    "        \"enable_sagemaker_debugger\": enable_sagemaker_debugger,\n",
    "        \"enable_checkpointing\": enable_checkpointing,\n",
    "        \"enable_tensorboard\": enable_tensorboard,\n",
    "        \"run_validation\": run_validation,\n",
    "        \"run_test\": run_test,\n",
    "        \"run_sample_predictions\": run_sample_predictions,\n",
    "    },\n",
    "    input_mode=input_mode,\n",
    "    enable_network_isolation=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Verify Training Job Failed Due to Container Network Isolation\n",
    "\n",
    "```\n",
    "botocore.exceptions.NoCredentialsError: Unable to locate credentials\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator.fit(\n",
    "    inputs={\"train\": s3_input_train_data, \"validation\": s3_input_validation_data, \"test\": s3_input_test_data},\n",
    "    wait=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training Job Name:  tensorflow-training-2020-12-20-18-23-58-828\n"
     ]
    }
   ],
   "source": [
    "training_job_name = estimator.latest_training_job.name\n",
    "print(\"Training Job Name:  {}\".format(training_job_name))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/sagemaker/home?region=us-east-1#/jobs/tensorflow-training-2020-12-20-18-23-58-828\">Training Job</a> After About 5 Minutes</b>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from IPython.core.display import display, HTML\n",
    "\n",
    "display(\n",
    "    HTML(\n",
    "        '<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}\">Training Job</a> After About 5 Minutes</b>'.format(\n",
    "            region, training_job_name\n",
    "        )\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logStream:group=/aws/sagemaker/TrainingJobs;prefix=tensorflow-training-2020-12-20-18-23-58-828;streamFilter=typeLogStreamPrefix\">CloudWatch Logs</a> After About 5 Minutes</b>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from IPython.core.display import display, HTML\n",
    "\n",
    "display(\n",
    "    HTML(\n",
    "        '<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix\">CloudWatch Logs</a> After About 5 Minutes</b>'.format(\n",
    "            region, training_job_name\n",
    "        )\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<b>Review <a target=\"blank\" href=\"https://s3.console.aws.amazon.com/s3/buckets/sagemaker-us-east-1-835319576252/tensorflow-training-2020-12-20-18-23-58-828/?region=us-east-1&tab=overview\">S3 Output Data</a> After The Training Job Has Completed</b>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from IPython.core.display import display, HTML\n",
    "\n",
    "display(\n",
    "    HTML(\n",
    "        '<b>Review <a target=\"blank\" href=\"https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview\">S3 Output Data</a> After The Training Job Has Completed</b>'.format(\n",
    "            bucket, training_job_name, region\n",
    "        )\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "2020-12-20 18:24:00 Starting - Starting the training job\n",
      "2020-12-20 18:24:02 Starting - Launching requested ML instances................\n",
      "2020-12-20 18:25:30 Starting - Preparing the instances for training...........\n",
      "2020-12-20 18:26:30 Downloading - Downloading input data.\n",
      "2020-12-20 18:26:37 Training - Downloading the training image.\n",
      "2020-12-20 18:26:51 Training - Training image download completed. Training in progress..\n",
      "2020-12-20 18:26:59 Uploading - Uploading generated training model\n",
      "2020-12-20 18:26:59 Failed - Training job failed\n"
     ]
    },
    {
     "ename": "UnexpectedStatusException",
     "evalue": "Error for Training job tensorflow-training-2020-12-20-18-23-58-828: Failed. Reason: AlgorithmError: framework error: \nTraceback (most recent call last):\n  File \"/usr/local/lib/python3.6/dist-packages/sagemaker_containers/_trainer.py\", line 84, in train\n    entrypoint()\n  File \"/usr/local/lib/python3.6/dist-packages/sagemaker_tensorflow_container/training.py\", line 210, in main\n    s3_utils.configure(user_hyperparameters.get('model_dir'), os.environ.get('SAGEMAKER_REGION'))\n  File \"/usr/local/lib/python3.6/dist-packages/sagemaker_tensorflow_container/s3_utils.py\", line 23, in configure\n    os.environ['S3_REGION'] = _s3_region(job_region, model_dir)\n  File \"/usr/local/lib/python3.6/dist-packages/sagemaker_tensorflow_container/s3_utils.py\", line 39, in _s3_region\n    bucket_location = s3.get_bucket_location(Bucket=bucket_name)['LocationConstraint']\n  File \"/usr/local/lib/python3.6/dist-packages/botocore/client.py\", line 316, in _api_call\n    return self._make_api_call(operation_name, kwargs)\n  File \"/usr/local/lib/python3.6/dist-packages/botocore/client.py\", line 613, in _make_api_call\n    oper",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mUnexpectedStatusException\u001b[0m                 Traceback (most recent call last)",
      "\u001b[0;32m<timed eval>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/estimator.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, logs)\u001b[0m\n\u001b[1;32m   1600\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogs_for_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlog_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1601\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1602\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1603\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1604\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mdescribe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36mwait_for_job\u001b[0;34m(self, job, poll)\u001b[0m\n\u001b[1;32m   3114\u001b[0m             \u001b[0;32mlambda\u001b[0m \u001b[0mlast_desc\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0m_train_done\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_client\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjob\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlast_desc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpoll\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3115\u001b[0m         )\n\u001b[0;32m-> 3116\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_job_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdesc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"TrainingJobStatus\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3117\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mdesc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3118\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36m_check_job_status\u001b[0;34m(self, job, desc, status_key_name)\u001b[0m\n\u001b[1;32m   3265\u001b[0m                 ),\n\u001b[1;32m   3266\u001b[0m                 \u001b[0mallowed_statuses\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Completed\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Stopped\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3267\u001b[0;31m                 \u001b[0mactual_status\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3268\u001b[0m             )\n\u001b[1;32m   3269\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mUnexpectedStatusException\u001b[0m: Error for Training job tensorflow-training-2020-12-20-18-23-58-828: Failed. Reason: AlgorithmError: framework error: \nTraceback (most recent call last):\n  File \"/usr/local/lib/python3.6/dist-packages/sagemaker_containers/_trainer.py\", line 84, in train\n    entrypoint()\n  File \"/usr/local/lib/python3.6/dist-packages/sagemaker_tensorflow_container/training.py\", line 210, in main\n    s3_utils.configure(user_hyperparameters.get('model_dir'), os.environ.get('SAGEMAKER_REGION'))\n  File \"/usr/local/lib/python3.6/dist-packages/sagemaker_tensorflow_container/s3_utils.py\", line 23, in configure\n    os.environ['S3_REGION'] = _s3_region(job_region, model_dir)\n  File \"/usr/local/lib/python3.6/dist-packages/sagemaker_tensorflow_container/s3_utils.py\", line 39, in _s3_region\n    bucket_location = s3.get_bucket_location(Bucket=bucket_name)['LocationConstraint']\n  File \"/usr/local/lib/python3.6/dist-packages/botocore/client.py\", line 316, in _api_call\n    return self._make_api_call(operation_name, kwargs)\n  File \"/usr/local/lib/python3.6/dist-packages/botocore/client.py\", line 613, in _make_api_call\n    oper"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "estimator.latest_training_job.wait(logs=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# [INFO] _Feel free to continue to the next workshop section while this notebook is running._"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fatal error: An error occurred (404) when calling the HeadObject operation: Key \"tensorflow-training-2020-12-20-18-23-58-828/output/model.tar.gz\" does not exist\n"
     ]
    }
   ],
   "source": [
    "!aws s3 cp s3://$bucket/$training_job_name/output/model.tar.gz ./model.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tar (child): ./model.tar.gz: Cannot open: No such file or directory\n",
      "tar (child): Error is not recoverable: exiting now\n",
      "tar: Child returned status 2\n",
      "tar: Error is not recoverable: exiting now\n"
     ]
    }
   ],
   "source": [
    "!mkdir -p ./model/\n",
    "!tar -xvzf ./model.tar.gz -C ./model/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2020-12-20 18:27:04.407046: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.0/lib64:/usr/local/cuda-10.0/extras/CUPTI/lib64:/usr/local/cuda-10.0/lib:/usr/local/cuda-10.0/efa/lib:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/lib64/openmpi/lib/:/usr/local/lib:/usr/lib:/usr/local/mpi/lib:/lib/:/usr/lib64/openmpi/lib/:/usr/local/lib:/usr/lib:/usr/local/mpi/lib:/lib/:/usr/lib64/openmpi/lib/:/usr/local/lib:/usr/lib:/usr/local/mpi/lib:/lib/:\n",
      "2020-12-20 18:27:04.407123: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.0/lib64:/usr/local/cuda-10.0/extras/CUPTI/lib64:/usr/local/cuda-10.0/lib:/usr/local/cuda-10.0/efa/lib:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/lib64/openmpi/lib/:/usr/local/lib:/usr/lib:/usr/local/mpi/lib:/lib/:/usr/lib64/openmpi/lib/:/usr/local/lib:/usr/lib:/usr/local/mpi/lib:/lib/:/usr/lib64/openmpi/lib/:/usr/local/lib:/usr/lib:/usr/local/mpi/lib:/lib/:\n",
      "2020-12-20 18:27:04.407136: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n",
      "Traceback (most recent call last):\n",
      "  File \"/home/ec2-user/anaconda3/envs/python3/bin/saved_model_cli\", line 8, in <module>\n",
      "    sys.exit(main())\n",
      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/tensorflow_core/python/tools/saved_model_cli.py\", line 990, in main\n",
      "    args.func(args)\n",
      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/tensorflow_core/python/tools/saved_model_cli.py\", line 691, in show\n",
      "    _show_all(args.dir)\n",
      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/tensorflow_core/python/tools/saved_model_cli.py\", line 272, in _show_all\n",
      "    tag_sets = saved_model_utils.get_saved_model_tag_sets(saved_model_dir)\n",
      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/tensorflow_core/python/tools/saved_model_utils.py\", line 88, in get_saved_model_tag_sets\n",
      "    saved_model = read_saved_model(saved_model_dir)\n",
      "  File \"/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/tensorflow_core/python/tools/saved_model_utils.py\", line 55, in read_saved_model\n",
      "    raise IOError(\"SavedModel file does not exist at: %s\" % saved_model_dir)\n",
      "OSError: SavedModel file does not exist at: ./model/tensorflow/saved_model/0/\n"
     ]
    }
   ],
   "source": [
    "!saved_model_cli show --all --dir ./model/tensorflow/saved_model/0/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Release Resources"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%html\n",
    "\n",
    "<p><b>Shutting down your kernel for this notebook to release resources.</b></p>\n",
    "<button class=\"sm-command-button\" data-commandlinker-command=\"kernelmenu:shutdown\" style=\"display:none;\">Shutdown Kernel</button>\n",
    "        \n",
    "<script>\n",
    "try {\n",
    "    els = document.getElementsByClassName(\"sm-command-button\");\n",
    "    els[0].click();\n",
    "}\n",
    "catch(err) {\n",
    "    // NoOp\n",
    "}    \n",
    "</script>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%javascript\n",
    "\n",
    "try {\n",
    "    Jupyter.notebook.save_checkpoint();\n",
    "    Jupyter.notebook.session.delete();\n",
    "}\n",
    "catch(err) {\n",
    "    // NoOp\n",
    "}"
   ]
  }
 ],
 "metadata": {
  "instance_type": "ml.t3.medium",
  "kernelspec": {
   "display_name": "Python 3 (Data Science)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
